Large Sample Comparison Results#

In this section we compare the different distribution estimation methods on a large sample of data and view the results from a wide range of evaluation metrics. The methods are compared by plotting cumulative distribution functions (CDFs) to visualize the behaviour across the full sample, since for estimation in ungauged basins it is important to understand the broader range of outcomes. The evaluation metrics include RMSE, RE, NSE, KGE, KLD, and EMD, each of which places a different emphasis on the underlying distribution characteristics.

import os
import pandas as pd
import numpy as np
from pathlib import Path
import xarray as xr

import json
from multiprocessing import Pool

import geopandas as gpd
from shapely.geometry import Point
import xyzservices.providers as xyz
from scipy.stats import linregress

from bokeh.plotting import figure, show, output_file, save
from bokeh.layouts import gridplot, row, column, layout
from bokeh.transform import factor_cmap, linear_cmap

from bokeh.models import ColumnDataSource, LinearAxis, Range1d, HoverTool, Div
from bokeh.io import output_notebook
from bokeh.palettes import Sunset10, Vibrant7, Category20, Bokeh6, Bokeh7, Bokeh8, Greys256, Blues256

from shapely.geometry import Point

from kde_estimator import KDEEstimator
from fdc_estimator_context import FDCEstimationContext
from fdc_data import StationData

import data_processing_functions as dpf

import xyzservices.providers as xyz
tiles = xyz['USGS']['USTopo']

# render bokeh plots inline in the notebook
output_notebook()
Loading BokehJS ...
attr_fpath = 'data/catchment_attributes_with_runoff_stats.csv'
# The attribute table uses a lowercase 'official_id' column (see every usage
# below); the previous dtype key 'Official_ID' matched no column and was
# silently ignored, so purely-numeric ids could lose leading zeros.
attr_df = pd.read_csv(attr_fpath, dtype={'official_id': str})
station_ids = sorted(attr_df['official_id'].unique().tolist())

# streamflow folder from (updated) HYSETS
HYSETS_DIR = Path('/home/danbot/code/common_data/HYSETS')
# HYSETS properties file is semicolon-delimited and uses capitalized column names
hs_df = pd.read_csv('data/HYSETS_watershed_properties.txt', sep=';')
hs_df = hs_df[hs_df['Official_ID'].isin(station_ids)]
hs_df.head(2)
Watershed_ID Source Name Official_ID Centroid_Lat_deg_N Centroid_Lon_deg_E Drainage_Area_km2 Drainage_Area_GSIM_km2 Flag_GSIM_boundaries Flag_Artificial_Boundaries ... Land_Use_Wetland_frac Land_Use_Water_frac Land_Use_Urban_frac Land_Use_Shrubs_frac Land_Use_Crops_frac Land_Use_Snow_Ice_frac Flag_Land_Use_Extraction Permeability_logk_m2 Porosity_frac Flag_Subsoil_Extraction
846 847 HYDAT CROWSNEST RIVER AT FRANK 05AA008 49.59732 -114.4106 402.6522 NaN 0 0 ... 0.0103 0.0065 0.0328 0.0785 0.0015 0.0002 1 -15.543306 0.170479 1
849 850 HYDAT CASTLE RIVER NEAR BEAVER MINES 05AA022 49.48866 -114.1444 820.6510 NaN 0 0 ... 0.0058 0.0023 0.0105 0.1156 0.0246 0.0000 1 -15.929747 0.150196 1

2 rows × 29 columns

# Lookup tables: Watershed_ID <-> Official_ID, and Official_ID -> drainage
# area (km2). Built with zip over the columns instead of three full
# iterrows() passes (iterrows boxes every row into a Series and is slow).
watershed_id_dict = dict(zip(hs_df['Watershed_ID'], hs_df['Official_ID']))
# and the inverse
official_id_dict = dict(zip(hs_df['Official_ID'], hs_df['Watershed_ID']))
# also for drainage areas
da_dict = dict(zip(hs_df['Official_ID'], hs_df['Drainage_Area_km2']))
# retrieve LSTM ensemble predictions
# retrieve LSTM ensemble predictions
lstm_result_folder = '/home/danbot/code/neuralhydrology/data/ensemble_results_20250514'
lstm_result_files = os.listdir(lstm_result_folder)
# the station id is the leading underscore-delimited token of each filename
lstm_result_stns = [fname.split('_')[0] for fname in lstm_result_files]

# filter for the common stations between BCUB region and LSTM-compatible (i.e. 1980-)
daymet_concurrent_stations = list(set(station_ids).intersection(lstm_result_stns))
# assert '012414900' in daymet_concurrent_stations
print(f'There are {len(daymet_concurrent_stations)} monitored basins concurrent with LSTM ensemble results.')
There are 723 monitored basins concurrent with LSTM ensemble results.
def load_and_filter_hysets_data(station_ids, hs_df):
    """Open the updated HYSETS netCDF and keep only the watersheds whose
    Official_ID is in `station_ids`, re-indexed by watershed id.

    Returns an xarray Dataset whose 'watershed' dimension is indexed by the
    (HYSETS) watershedID values, so stations can be selected by id.
    """
    hs_df = hs_df[hs_df['Official_ID'].isin(station_ids)]

    # load the updated HYSETS data
    ds = xr.open_dataset(HYSETS_DIR / 'HYSETS_2023_update_QC_stations.nc')

    # boolean mask over the watershed dimension: True where the dataset's
    # watershedID belongs to one of the selected stations
    keep = np.isin(ds['watershedID'].data, hs_df['Watershed_ID'].values)
    ds = ds.sel(watershed=keep)

    # promote 'watershedID' to a coordinate on the 'watershed' dimension,
    # then make it the index of that dimension
    ds = ds.assign_coords(watershedID=("watershed", ds["watershedID"].data))
    return ds.set_index(watershed="watershedID")


# build the filtered HYSETS dataset once; per-station discharge series are
# pulled out of it below
ds = load_and_filter_hysets_data(station_ids, hs_df)
def retrieve_timeseries_discharge(stn, ds):
    """Extract the daily discharge series for one station from the HYSETS
    dataset.

    Returns a time-indexed DataFrame with two columns: the raw discharge
    (named after the station) and '<stn>_uar', the unit area runoff
    (L/s/km2) computed from the station's drainage area.
    """
    ws_id = official_id_dict[stn]
    flows = ds['discharge'].sel(watershed=str(ws_id)).to_dataframe(name='discharge').reset_index()
    flows = flows.set_index('time')[['discharge']].dropna()
    # floor flows at 1e-4 so later log transforms stay finite
    flows['discharge'] = np.clip(flows['discharge'], 1e-4, None)
    flows = flows.rename(columns={'discharge': stn})
    flows[f'{stn}_uar'] = 1000 * flows[stn] / da_dict[stn]
    return flows


def compare_results_and_input(stn, sim_df, ds):
    """Compare the input streamflow timeseries with the observed streamflow timeseries.
    Check that the dates in the output match the common dates between Daymet and the input data.
    """
    obs_df = retrieve_timeseries_discharge(stn, ds)
    # restrict to the Daymet-compatible period
    obs_df = obs_df.loc[obs_df.index >= '1980-01-01']

    # inner join keeps only dates present in both the observed and simulated series
    merged = pd.concat([obs_df, sim_df], axis=1, join='inner').dropna()

    # simulated/observed LSTM columns are stored in log space; back-transform
    merged['streamflow_obs'] = np.exp(merged['streamflow_obs'])
    sim_cols = [c for c in sim_df.columns if c.startswith('streamflow_sim')]
    merged[sim_cols] = np.exp(merged[sim_cols])

    # sanity check: the input unit-area runoff and the model's observed series
    # should agree to within ~1 L/s/km2
    if not np.allclose(merged[f'{stn}_uar'], merged['streamflow_obs'], atol=1):
        max_diff = np.abs(merged[f'{stn}_uar'] - merged['streamflow_obs']).max()
        print(f'Warning: {stn} has a max difference of {max_diff:.2f} between the input and output streamflow timeseries.')

    return merged

def process_lstm_station(args):
    """Load one station's LSTM ensemble CSV and compare it against HYSETS
    observations.

    Takes a single (stn, folder, ds) tuple so it can be mapped over a
    multiprocessing Pool. Returns (stn, comparison DataFrame), or
    (stn, None) on any failure so one bad station does not kill the pool.
    """
    stn, folder, ds = args
    try:
        ensemble = pd.read_csv(os.path.join(folder, f'{stn}_ensemble.csv'))
        # the index column comes through unnamed; restore it as a datetime index
        ensemble = ensemble.rename(columns={'Unnamed: 0': 'time'})
        ensemble['time'] = pd.to_datetime(ensemble['time'])
        ensemble = ensemble.set_index('time')
        return stn, compare_results_and_input(stn, ensemble, ds)
    except Exception as e:
        # best-effort: report and continue
        print(f"Failed to process {stn}: {e}")
        return stn, None
# retrieve LSTM ensemble predictions
# filter for the common stations
common_stations = list(set(station_ids) & set(lstm_result_stns))
print(f'There are {len(common_stations)} monitored basins with LSTM ensemble results.')
# keep only attribute rows for stations that have LSTM results
attr_df = attr_df[attr_df['official_id'].isin(common_stations)]

# NOTE(review): the parallel LSTM comparison below is disabled, so
# lstm_results is not produced on this run.
# args_list = [(stn, lstm_result_folder, ds) for stn in common_stations]

# with Pool() as pool:
#     lstm_results = dict(pool.map(process_lstm_station, args_list))
There are 723 monitored basins with LSTM ensemble results.
# load the predicted parameter results
target_cols = [
    'uar_mean_mean_predicted', 'uar_std_mean_predicted', 'uar_median_mean_predicted', 'uar_mad_mean_predicted',
    'log_uar_mean_mean_predicted', 'log_uar_std_mean_predicted', 'log_uar_median_mean_predicted', 'log_uar_mad_mean_predicted',
]

parameter_prediction_results_folder = 'data/results/parameter_prediction_results'
predicted_params_fpath = os.path.join(parameter_prediction_results_folder, 'mean_parameter_predictions.csv')
stats = pd.read_csv(predicted_params_fpath, index_col=['official_id'], dtype={'official_id': str})
stats.head()
# NOTE: the former column rename ("_".join(c.split("_"))) was an identity
# transform (split then rejoin on the same separator) and has been removed.
print(f' Loaded {len(stats)} stations with predicted parameters from {predicted_params_fpath}')

    
 Loaded 1098 stations with predicted parameters from data/results/parameter_prediction_results/mean_parameter_predictions.csv
plots = []
# pair each observed central-tendency column with its matching spread column
m1s = ['log_uar_mean_actual', 'log_uar_std_actual']
m2s = ['log_uar_median_actual', 'log_uar_mad_actual']
for m1, s1 in [m1s, m2s]:
    p = figure(title=f'Observed {m1} vs. {s1} over (N={len(stats)})', width=600, height=350)
    slope, intercept, r_value, p_value, std_err = linregress(stats[m1], stats[s1])
    p.scatter(stats[m1], stats[s1], size=10, color='green', alpha=0.5, legend_label='Observed')
    x = np.linspace(stats[m1].min(), stats[m1].max(), 100)
    p.line(x, slope * x + intercept, color='red',
           legend_label=f'Y={slope:.2f}x + {intercept:.2f} (R²={r_value**2:.2f})', line_width=2)
    # Extract the statistic names ('mean', 'std', ...) outside the f-strings:
    # reusing the same quote character inside an f-string is a SyntaxError on
    # Python < 3.12, so this form is both portable and clearer.
    m1_stat = m1.split('_')[2]
    s1_stat = s1.split('_')[2]
    p.xaxis.axis_label = f'Log {m1_stat} unit area runoff (L/s/km²)'
    p.yaxis.axis_label = f'Log {s1_stat} of unit area runoff (L/s/km²)'
    # p.legend.location = 'top_left'
    p = dpf.format_fig_fonts(p, font_size=14)
    plots.append(p)
lt = column(plots)
show(lt)
from scipy.spatial import cKDTree
from sklearn.preprocessing import StandardScaler

# station centroids in geographic coords, reprojected to EPSG:3005 (BC Albers)
# so KD-tree distances are in metres rather than degrees
centroids = [Point(e['centroid_lon_deg_e'], e['centroid_lat_deg_n']) for e in attr_df.to_dict('records')]
pts_df = gpd.GeoDataFrame({'geometry': centroids}, crs='EPSG:4326')
pts_df.to_crs('EPSG:3005', inplace=True)

# KD-tree over projected station locations for spatial nearest-neighbour queries
coords = np.array([[geom.x, geom.y] for geom in pts_df.geometry.centroid])
stn_tree = cKDTree(coords)

# Create mapping from official_id to index
id_to_index = {oid: i for i, oid in enumerate(attr_df["official_id"])}
index_to_id = {i: oid for oid, i in id_to_index.items()}  # Reverse mapping

# Extract values (excluding 'official_id' since it's categorical)
attribute_columns = ['log_drainage_area_km2', 'elevation_m', 'prcp', 'tmean', 'swe',
                     'centroid_lon_deg_e', 'centroid_lat_deg_n', 'land_use_forest_frac_2010', 
                    #  'land_use_snow_ice_frac_2010', 'land_use_wetland_frac_2010', 'land_use_water_frac_2010', 
                      ]
# derive mean temperature from min/max; must happen before attr_values is built
attr_df['tmean'] = (attr_df['tmin'] + attr_df['tmax']) / 2.0
# attr_gdf['log_drainage_area_km2'] = np.log(attr_df['drainage_area_km2'])

# standardize attributes so the attribute-space KD-tree treats features equally
attr_values = attr_df[attribute_columns].to_numpy()
scaler = StandardScaler()
normalized_attr_values = scaler.fit_transform(attr_values)
# Convert normalized distances back to original units
std_devs_attrs = scaler.scale_  # Standard deviation of each feature
attr_tree = cKDTree(normalized_attr_values)
prior = 1
# NOTE(review): the 'DKL' assignment is immediately overridden; EMD is the
# divergence measure actually in effect.
divergence_measure = 'DKL'
divergence_measure = 'EMD'
create_and_save_plots = False
estimate_by_regression = False
process_LSTM_results = True
result_rev_date = '20250412'

Notes#

Nuance of “concurrent” kNN: k or not k?#

When using kNN derived from sparse monitoring networks with inconsistent coverage, the “k” isn’t exactly “k”. At the timestep level, addressing gaps in data yields different interpretations of k, where the simplest interpretation is that it represents the number of independent monitoring locations used to generate an ensemble simulation of an unmonitored location. Werstuck and Coulibaly (2018) describe infilling data gaps with kNN, effectively a nested kNN, which appears most consistent with the “dynamic k” described below, but it is not the same; rather, the infilled data are a temporal mean, which suppresses the variability of the ensemble. Below we describe several variants that address data gaps in kNN selection, and how they relate to the concept of k:

  • Maximum k: The maximum number of stations that can be used to generate an ensemble simulation. In the event of missing observations at one or more of the k stations, the effective number of stations is less than k. Overall, \(k_\text{actual} \leq k\). Given periods of concurrent gaps, the effective ensemble size could be much lower than k, though it must be at least 1.

  • Strict k: The number of stations used to generate an ensemble simulation is strictly k. This method is related to the set-cover problem, where the goal is to select a subset of stations that maximizes the intersection of their data availability over a specified time period. The problem is NP-hard and requires a greedy or approximate subset selection strategy when exhaustively testing all combinations becomes computationally infeasible. I set a lenient minimum concurrent period (5 complete periods of 12 consecutive complete months (minimum 10 days per month)) to avoid expensive computation. This is more truly “k-nearest”, but it achieves this by searching further away in the network and ignoring potentially more relevant information if it is not concurrent with all records – the higher the k, the greater chance of misalignment of records.

  • Effective k: The number of stations used to generate the ensemble simulation increases until an average of k observations per timestep is reached. In the event of missing observations at one or more of the k stations, more (less similar, more distant) stations must be incorporated to yield an average of k ensemble members per timestep. Overall, \(k_\text{actual} \geq k\). This requires generating weights per timestep such that they sum to 1, since the set comprising k is not constant.

  • Dynamic k: The number of stations overall to generate an ensemble simulation varies, but we guarantee that the number of stations used to generate the simulation is k at each timestep. Here k is really fixed but the stations may vary across timesteps.

Results#

def load_results(args):
    """Load FDC estimation results for a single station and method.

    Parameters
    ----------
    args : tuple
        (station id, root results folder, method sub-folder) packed as one
        tuple so the function can be mapped over a multiprocessing Pool.

    Returns
    -------
    pd.DataFrame
        One row per label in the station's JSON result file, with columns
        Official_ID, Label, KLD, EMD, RMSE, RE, NSE, KGE. Metrics missing
        from a label's 'eval' dict come through as None/NaN.
    """
    stn, result_folder, method = args
    fpath = Path(result_folder) / method / f"{stn}_fdc_results.json"
    with open(fpath) as f:
        data = json.load(f)
    # Build all rows first and construct a single DataFrame; this avoids
    # concatenating one single-row DataFrame per label, which is much slower.
    records = [
        {
            'Official_ID': stn,
            'Label': label,
            'KLD': d['eval'].get('kld'),
            'EMD': d['eval'].get('emd'),
            'RMSE': d['eval'].get('rmse'),
            'RE': d['eval'].get('relative_error'),
            'NSE': d['eval'].get('nse'),
            'KGE': d['eval'].get('kge'),
        }
        for label, d in data.items()
    ]
    # explicit column list keeps the order stable even for an empty file
    return pd.DataFrame(records, columns=['Official_ID', 'Label', 'KLD', 'EMD', 'RMSE', 'RE', 'NSE', 'KGE'])


results_dfs = {}
sub_folder = 'knn'
# NOTE(review): the first results_folder assignment (external drive) is
# immediately overridden by the local path below.
results_folder = '/media/danbot/Samsung_T5/fdc_estimation_results_1980/'
results_folder = 'data/results/fdc_estimation_results'
# station ids are the leading underscore-delimited token of each result filename
completed_stns = [c.split('_')[0] for c in os.listdir(os.path.join(results_folder, sub_folder))]
print(f'Found {len(set(completed_stns))} completed stations in {sub_folder} results folder.')

# Load (or build and cache) the combined evaluation results for each method.
for method in ['parametric', 'lstm', 'knn']:
    print(f'   Loading {method} results')
    method_results_fpath = os.path.join('data', f'{method}_all_results.csv')
    if os.path.exists(method_results_fpath):
        # cached combined CSV exists -- load it directly
        results_dfs[method] = pd.read_csv(method_results_fpath, dtype={'Official_ID': str})
        print(f'   Loaded {len(results_dfs[method])} {method} results from {method_results_fpath}')
    else:
        print(f'   {method} results not found in {method_results_fpath}, loading from individual station files...')
        args = [(stn, results_folder, method) for stn in completed_stns]
        with Pool() as pool:
            results_list = pool.map(load_results, args)

        # concatenate once (previously this was done twice, duplicating work)
        method_results = pd.concat(results_list, ignore_index=True)
        bad_dkl = method_results[method_results['KLD'].isna() | (method_results['KLD'] < 0)].copy()
        if not bad_dkl.empty:
            print(f'Warning: {len(bad_dkl)} {method} rows with NaN or negative DKL values.')
            bad_stns = bad_dkl['Official_ID'].values
            raise Exception(f'Results have {len(bad_stns)} NaN or negative DKL values: {bad_stns}')
        results_dfs[method] = method_results
        print(f'   Loaded {int(len(results_dfs[method])/len(set(completed_stns)))} station results for {method} results')
        # cache the combined table so subsequent runs take the fast path above
        method_results.to_csv(method_results_fpath, index=False)
Found 723 completed stations in knn results folder.
   Loading parametric results
   Loaded 2892 parametric results from data/parametric_all_results.csv
   Loading lstm results
   Loaded 1446 lstm results from data/lstm_all_results.csv
   Loading knn results
   Loaded 57840 knn results from data/knn_all_results.csv
# count the number of NaN values in each column of results_dfs['parametric']
nan_counts = results_dfs['parametric'].isna().sum()
print("NaN counts in parametric results:")
print(nan_counts[nan_counts > 0])  # an empty Series means no missing values
# results_dfs['parametric'].head()
NaN counts in parametric results:
Series([], dtype: int64)
# add the unrestrained, 1950-present kNN results
# pre_1950_results_folder = 'data/fdc_estimation_results_1950/'
# results_list = parallel_load_fdc_estimation_results(completed_stns, pre_1950_results_folder, 'knn')
# results_dfs['knn_1950'] = pd.concat(results_list, ignore_index=True)
def split_knn_label_col(df):
    """Expand the underscore-delimited kNN 'Label' strings into separate
    configuration columns (k, tree_type, ensemble_weight, ensemble_method).

    Note: adds a temporary 'n_parts' column to the input frame in place;
    the returned frame has it (and the constant 'NN'/'dist' tokens) removed.
    """
    print(df.columns)
    tokens = df['Label'].str.split('_')
    df['n_parts'] = tokens.str.len()

    assert len(set(df['n_parts'])) == 1, "Not all labels have the same number of parts"

    # Expected positional fields within each label string.
    # format_a_cols = ["Official_ID", "k", "NN", 'concurrent', 'tree_type', 'dist', 'weighting', 'ensemble_method']
    format_cols = ["Official_ID", "k", "NN", 'tree_type', 'dist', 'ensemble_weight', 'ensemble_method']

    # keep only rows whose label matches the expected token count
    matching = df[df['n_parts'] == len(format_cols)].copy()

    # expand the label into one column per token
    expanded = matching['Label'].str.split('_', expand=True)
    expanded.columns = format_cols

    combined = pd.concat(
        [matching.reset_index(drop=True), expanded.reset_index(drop=True)],
        axis=1,
    )

    # drop helper/constant columns; keep only the first occurrence of any
    # duplicated column name (e.g. Official_ID appears in both halves)
    combined.drop(columns=['NN', 'dist', 'n_parts', 'minYears', 'minOverlapPct'],
                  errors='ignore', inplace=True)
    return combined.loc[:, ~combined.columns.duplicated()]
# unique estimation-method labels present in the parametric results
parametric_targets = list(set(results_dfs['parametric']['Label'].values))
# expand the kNN label strings into separate configuration columns
results_dfs['knn'] = split_knn_label_col(results_dfs['knn'])
# results_dfs['knn_1950'] = split_knn_label_col(results_dfs['knn_1950'])
# for k in results_dfs.keys():
#     print(k, len(results_dfs[k]))
Index(['Official_ID', 'Label', 'KLD', 'EMD', 'RMSE', 'RE', 'NSE', 'KGE'], dtype='object')
# create a dict to map the line colours and line types to the various approximation methods
method_dict = {
    'LN_predicted_log_params': {'color': 'dodgerblue', 'line_type': 'solid', 'label': 'Log Parametric'},
    'MOM_predicted_params': {'color': 'dodgerblue', 'line_type': 'dashed', 'label': 'MOM parametric'},
    'lstm_ensemble_time': {'color': 'green', 'line_type': 'solid', 'label': 'LSTM time ensemble'},
    'lstm_ensemble_freq': {'color': 'green', 'line_type': 'dashed', 'label': 'LSTM freq ensemble'},
}
# add the various kNN methods.
# NOTE(review): the loop below is currently inert -- every statement that
# would add kNN entries to method_dict is commented out, so only the four
# entries above are defined; 'ln' and 'ensemble_type' are assigned but unused.
line_types = ['solid', 'dashed', 'dotdash', 'dotted']
for k in range(1, 11):
    c = Sunset10[k - 1]
    for wm in ['ID1', 'ID2']:
        for dist_type in ['spatial', 'attribute']:
            for min_overlap in [0, 50, 100]:#[0, 25, 50, 75, 100]:
                ln = 0
                
                ensemble_type = 'timeEnsemble'
                # label = f'{k}_NN_{wm}_{dist_type}_dist_{min_overlap}_{ensemble_type}'
                # display_label = f'{k}NN {wm} {dist_type} {time_type} {knn_method} time'
                # method_dict[label] = {'color': c, 'line_type': line_types[ln], 'label': display_label}
                
                # ensemble_type = 'freqEnsemble'
                # label2 = f'{k}_NN_{wm}_{dist_type}_dist_{time_type}_{ensemble_type}'
                # # 1_NN_EW_spatial_dist_concurrent_freqEnsemble'
                # display_label2 = f'{k}NN {wm} {dist_type} {time_type} freq.'
                # method_dict[label2] = {'color': c, 'line_type': line_types[ln], 'label': display_label2}
                # ln += 1

Create a narrative story with the results. We started by asking about the “simplest” approximation of an FDC, the parametric approximation from predicted values.

  1. First, let’s review the predicted and “observed” mean, standard deviation, log-mean, and log-standard deviation predicted from catchment attributes. This will give us a first clue of what to expect as far as which approach yields the better approximation of the FDC. Since we’re predicting these values using an objective function that minimizes the difference between predicted and observed values, we should first look at the distribution of the target values, since some metrics are sensitive to skewness and outliers.

# For each predicted target: scatter predicted vs. observed with a fitted
# regression line, plus a histogram of the observed values (to judge skewness).
hist_plots, plots = [], []
for target in target_cols:
    print(target)
    # strip the trailing '_mean_predicted' to recover the base variable name,
    # e.g. 'uar_mean_mean_predicted' -> 'uar_mean'
    b = '_'.join(target.split('_')[:-2])
    pred, obs = stats[f'{b}_mean_predicted'].values, stats[f'{b}_actual'].values
    # get the regression results from scipy.stats.linregress()
    slope, intercept, r_value, p_value, std_err = linregress(obs, pred)
    f = figure()
    x = np.linspace(obs.min(), obs.max(), 100)
    y = slope * x + intercept    
    
    f.scatter(obs, pred, size=5, color='black', alpha=0.5, legend_label=f'{target}')
    f.line(x, y, line_color='firebrick', line_width=2, legend_label=f'{target} (R²={r_value**2:.2f})')
    f.legend.location = 'bottom_right'
    f.yaxis.axis_label = f'Predicted {target} (L/s/km²)'
    f.xaxis.axis_label = f'Observed {target} (L/s/km²)'
    
    # Create histogram data (counts normalized to percentages)
    hist, edges = np.histogram(obs, bins=20)
    hist = hist / hist.sum() * 100  # Normalize to percentage

    # Prepare data for quad glyph: left and right edges of bins
    hist_source = ColumnDataSource(data=dict(
        left=edges[:-1],
        right=edges[1:],
        top=hist,
        bottom=np.zeros_like(hist)
    ))

    # Create figure
    hist_plot = figure(width=450, height=100, x_axis_label=target, y_axis_label='Percentage')

    # Plot using quad
    hist_plot.quad(
        top='top', bottom='bottom', left='left', right='right',
        source=hist_source,
        fill_color='dodgerblue', fill_alpha=0.5, line_color='black'
    )

    # Optional styling
    hist_plot.xaxis.axis_label = target
    hist_plot.yaxis.axis_label = 'P(x)'
    f.legend.background_fill_alpha = 0.3
    f = dpf.format_fig_fonts(f, font_size=15)
    hist_plots.append(hist_plot)
    plots.append(f)
uar_mean_mean_predicted
uar_std_mean_predicted
uar_median_mean_predicted
uar_mad_mean_predicted
log_uar_mean_mean_predicted
log_uar_std_mean_predicted
log_uar_median_mean_predicted
log_uar_mad_mean_predicted
# grid of observed-value histograms (display currently disabled)
lt = gridplot(hist_plots, ncols=2, width=450, height=300)
# show(lt)
# grid of predicted-vs-observed scatter plots (display currently disabled)
lt = gridplot(plots, ncols=2, width=450, height=300)
# show(lt)

From the plots above, the coefficients of determination around 0.8 for the mean and standard deviation show that these are reasonably well predicted from catchment attributes. The log-mean is also fairly predictable, but the log-standard deviation is not. This might lead us to expect that the FDC approximation based on the predicted log-mean and log-stdev might be worse than the approximation based on the predicted mean and stdev. However, the method of moments must still be applied to the mean and stdev to get the log-normal parameters.

The question is how the structural error introduced by the method of moments interacts with the parametric model, and how the log-transform affects the distribution of the target variables and the objective function of the predictive model by extension. While the log-transform doesn’t change the rank of the values, it still changes the emphasis on the objective function of the predictive model, since the model’s performance can vary significantly based on the distribution of the input data. By de-emphasizing the tails by the log-transform, we are effectively changing the model’s focus to the central tendency of the data. (should we consider quantile regression or another form of robust regression)?

Next, take a look at the distribution of the FDCs for these two approaches.

  1. Plot the distribution of FDCs to demonstrate the first comparison of the parametric approximations: Is the parametric approximation better based on predicted mean and variance via the method of moments, or is it better based on the predicted log-mean and log-standard deviation which avoids the method of moments.

def compute_empirical_cdf(data):
    """Compute the empirical CDF of the data.

    Non-finite values (NaN, +/-inf) are removed before sorting. The original
    docstring promised this but the filter line was commented out, so NaNs
    sorted to the end of the array and silently distorted the CDF.

    Parameters
    ----------
    data : array-like of float

    Returns
    -------
    tuple of np.ndarray
        (sorted_values, cdf) where cdf[i] = (i + 1) / n for the i-th
        smallest retained value.
    """
    data = np.asarray(data, dtype=float)
    # np.isfinite is False for NaN as well as +/-inf, so one mask covers both
    data = data[np.isfinite(data)]

    sorted_data = np.sort(data)
    n = len(sorted_data)
    cdf = np.arange(1, n + 1) / n
    return sorted_data, cdf
# combine the parametric and LSTM evaluation results into one table
fdc_df = pd.concat([results_dfs['parametric'], results_dfs['lstm']], axis=0)
# fdc_df = results_dfs['parametric'].copy()
# quick inspection of the labels and columns present
np.unique(fdc_df['Label'].values)
results_dfs['parametric'].keys()
print(len(fdc_df))
fdc_df.head()
4338
Official_ID Label KLD EMD RMSE RE NSE KGE
0 08EE008 MLE 0.095303 3.1684 5.903078 0.175767 0.872205 0.779896
1 08EE008 PredictedLog 0.124748 7.2130 14.582935 0.423448 0.220087 0.153269
2 08EE008 PredictedMOM 0.365106 7.3746 9.449604 1.060105 0.672521 0.382503
3 08EE008 RandomDraw 0.233349 6.7769 10.032229 0.474453 0.630894 0.354130
4 09AA013 MLE 0.179145 2.6509 3.431122 0.181807 0.954041 0.960200
def get_result_and_ids(label, metric):
    """Return (values, station ids) for one method label and one metric,
    with NaN rows dropped.

    NSE and KGE are flipped to 1 - value so that, as with the error
    metrics, smaller is better when plotting the CDFs.
    """
    subset = fdc_df[fdc_df['Label'] == label].copy()
    subset = subset.dropna(subset=[metric])
    vals = subset[metric].values
    if metric in ('NSE', 'KGE'):
        # for NSE and KGE, we want to plot the upper bound as the maximum value
        vals = 1 - vals
    return vals, subset['Official_ID']

def plot_parametric_bounds(f, dm, lb_label='MLE', rb_label='RandomDraw', label='LogNorm bounds', color='black'):
    """Shade the region between the empirical CDFs of two reference methods
    on bokeh figure `f` for divergence metric `dm`.

    NOTE(review): despite the parameter names, `lb_label` feeds the *upper*
    bound series and `rb_label` the *lower* bound series below -- confirm
    the naming/intent with the author.
    """
    # plot an HArea describing the MLE (upper) and Random (lower) bounds
    upper_bound = fdc_df[fdc_df['Label'] == lb_label][dm].values
    lower_bound = fdc_df[fdc_df['Label'] == rb_label][dm].values
    if dm in ['NSE', 'KGE']:
        # for NSE and KGE, we want to plot the upper bound as the maximum value
        upper_bound = 1 - upper_bound
        lower_bound = 1 - lower_bound
    x_upper, y_upper = compute_empirical_cdf(upper_bound)
    x_lower, y_lower = compute_empirical_cdf(lower_bound)
    f.harea(x1=x_lower, x2=x_upper, y=y_upper, fill_color=color, fill_alpha=0.3, legend_label=label)
    return f

def get_knn_group_results(tree_type='attribute', ensemble_type='freqEnsemble', weighting='ID2', k=7, which_set='knn'):
    """Select the kNN result rows matching one tree/ensemble/weighting/k combination."""
    df = results_dfs[which_set]
    mask = (
        (df['tree_type'] == tree_type)
        & (df['ensemble_method'] == ensemble_type)
        & (df['ensemble_weight'] == weighting)
        & (df['k'] == str(k))  # k is stored as a string in the results table
    )
    return df[mask].copy()
fdc_df.head()
# distinct method labels present in the combined results table
list(set(fdc_df['Label'].values))
['MLE', 'RandomDraw', 'PredictedMOM', 'time', 'frequency', 'PredictedLog']
# LaTeX axis labels, ordered to match the metric loop ['KLD', 'EMD', 'RMSE', 'RE', 'NSE', 'KGE']
axis_labels = [
    r'$$D_\text{KL}(\text{PDF}_\text{Baseline}||\text{PDF}_\text{Est.})$$', 
    r'$$D_\text{EMD}(\text{PDF}_\text{Baseline}||\text{PDF}_\text{Est.})$$',
    r'$$D_\text{RMSE}(\text{PDF}_\text{Baseline}||\text{PDF}_\text{Est.})$$',
    r'$$D_\text{RE}(\text{PDF}_\text{Baseline}||\text{PDF}_\text{Est.})$$',
    r'$$D_\text{NSE}(\text{PDF}_\text{Baseline}||\text{PDF}_\text{Est.})$$',
    r'$$D_\text{KGE}(\text{PDF}_\text{Baseline}||\text{PDF}_\text{Est.})$$'
]

# Compare all FDC estimation methods, one CDF panel per evaluation metric.
# Each panel shows the empirical CDF of per-station scores for the parametric
# (LogNorm), kNN, and LSTM methods, over the shaded parametric MLE/RandomDraw
# band.  NSE and KGE are plotted as 1 - score so "smaller is better" holds in
# every panel.
lines = ['solid', 'dotted', 'dashed', 'solid']  # unused here; retained for downstream cells
clrs = ['black', 'black', 'red', 'red']
plots = []
main_result_vals = {}  # per-method frames of (station id, score), keyed '<method> <metric>'

print(f"N={len(set(fdc_df['Official_ID']))}")
for dm, x_label in zip(['KLD', 'EMD', 'RMSE', 'RE', 'NSE', 'KGE'], axis_labels):
    f = figure(x_axis_type='log', title=f'FDC Evaluation by {dm} (N={len(fdc_df)})', width=600, height=400)

    # shaded band between the parametric MLE (upper) and RandomDraw (lower) CDFs
    f = plot_parametric_bounds(f, dm)

    # parametric results: method-of-moments LogNorm
    label = 'PredictedMOM'
    data, ids = get_result_and_ids(label, dm)
    print(f'PredictedMOM {dm} results: {len(data)}')  # fixed: previously mislabelled as PredictedLog
    main_result_vals['LN MoM' + f' {dm}'] = pd.DataFrame({'ids': ids, 'values': data})
    x, y = compute_empirical_cdf(data)
    f.line(x, y, line_width=2, line_dash='dashed', color='black', legend_label='MoM LogNorm')

    # parametric results: directly predicted log-mean / log-stdev LogNorm
    label = 'PredictedLog'
    data, ids = get_result_and_ids(label, dm)
    main_result_vals['LN Direct' + f' {dm}'] = pd.DataFrame({'ids': ids, 'values': data})
    x, y = compute_empirical_cdf(data)
    f.line(x, y, line_width=2, line_dash='solid', color='black', legend_label='LogNorm')
    print(f'PredictedLog {dm} results: {len(data)}')

    # kNN ensembles at a small (riskier) and a large (risk-hedging) k
    low_k, high_k = 3, 9
    select_knn = get_knn_group_results(k=low_k)
    knn_vals = select_knn[dm].values
    if dm in ['NSE', 'KGE']:
        knn_vals = 1 - select_knn[dm].values
    kx, ky = compute_empirical_cdf(knn_vals)
    # guard against NaNs surviving into the CDF arrays
    n_nan_kx = np.sum(np.isnan(kx))
    n_nan_ky = np.sum(np.isnan(ky))
    if n_nan_kx > 0 or n_nan_ky > 0:
        print(f'Warning: {n_nan_kx} NaN values in kx and {n_nan_ky} NaN values in ky for {low_k} kNN {dm}')
        valid_indices = ~np.isnan(kx) & ~np.isnan(ky)
        kx, ky = kx[valid_indices], ky[valid_indices]
    print(f'kNN {low_k} {dm} results: {len(select_knn)}')
    main_result_vals[f'{low_k} kNN {dm}'] = pd.DataFrame({'ids': select_knn['Official_ID'].values, 'values': knn_vals})
    f.line(kx, ky, line_width=3, line_dash='solid', color='orange', legend_label=f'{low_k}-NN')

    select_knn = get_knn_group_results(k=high_k)
    knn_vals = select_knn[dm].values
    if dm in ['NSE', 'KGE']:
        knn_vals = 1 - select_knn[dm].values
    kx, ky = compute_empirical_cdf(knn_vals)
    print(f'kNN {high_k} {dm} results: {len(select_knn)}')
    main_result_vals[f'{high_k} kNN {dm}'] = pd.DataFrame({'ids': select_knn['Official_ID'].values, 'values': knn_vals})
    f.line(kx, ky, line_width=3, line_dash='dashed', color='orange', legend_label=f'{high_k}-NN')

    # LSTM results: time- and frequency-averaged ensembles
    lstm_time = fdc_df[fdc_df['Label'] == 'time'][dm].values
    if dm in ['NSE', 'KGE']:
        lstm_time = 1 - lstm_time
    lstm_ids = fdc_df[fdc_df['Label'] == 'time']['Official_ID'].values
    x_lstm, y_lstm = compute_empirical_cdf(lstm_time)
    main_result_vals[f'LSTM time {dm}'] = pd.DataFrame({'values': lstm_time, 'ids': lstm_ids})
    f.line(x_lstm, y_lstm, line_width=3, line_dash='dotted', color='green', legend_label='LSTM Time')

    lstm_freq = fdc_df[fdc_df['Label'] == 'frequency'][dm].values
    if dm in ['NSE', 'KGE']:
        lstm_freq = 1 - lstm_freq
    lstm_ids = fdc_df[fdc_df['Label'] == 'frequency']['Official_ID'].values
    main_result_vals[f'LSTM freq {dm}'] = pd.DataFrame({'values': lstm_freq, 'ids': lstm_ids})
    x_lstm, y_lstm = compute_empirical_cdf(lstm_freq)
    f.line(x_lstm, y_lstm, line_width=2, line_dash='solid', color='green', legend_label='LSTM Frequency')

    # keep the (observed-fit) MLE scores for the later rank-correlation analysis
    label = 'MLE'
    data, ids = get_result_and_ids(label, dm)
    main_result_vals[label + f'_{dm}'] = pd.DataFrame({'ids': ids, 'values': data})

    if dm == 'KLD':  # fixed: condition previously tested 'DKL', which never occurs in this loop
        f.xaxis.axis_label = 'KL Divergence [bits/sample]'
    else:
        f.xaxis.axis_label = x_label
    f.yaxis.axis_label = r'$$P(X\leq x)$$'
    f.legend.location = 'bottom_right'  # fixed: removed duplicate assignment ('top_left' was immediately overwritten)
    f.legend.background_fill_alpha = 0.0
    f.legend.click_policy = 'hide'
    f = dpf.format_fig_fonts(f, font_size=14)
    plots.append(f)
lt = gridplot(plots, ncols=2, width=500, height=475)
show(lt)
N=723
PredictedLog KLD results: 723
PredictedLog KLD results: 723
kNN 3 KLD results: 723
kNN 9 KLD results: 723
PredictedLog EMD results: 723
PredictedLog EMD results: 723
kNN 3 EMD results: 723
kNN 9 EMD results: 723
PredictedLog RMSE results: 723
PredictedLog RMSE results: 723
kNN 3 RMSE results: 723
kNN 9 RMSE results: 723
PredictedLog RE results: 723
PredictedLog RE results: 723
kNN 3 RE results: 723
kNN 9 RE results: 723
PredictedLog NSE results: 723
PredictedLog NSE results: 723
kNN 3 NSE results: 723
kNN 9 NSE results: 723
PredictedLog KGE results: 723
PredictedLog KGE results: 723
kNN 3 KGE results: 723
kNN 9 KGE results: 723

Add NSE over some range of quantiles or over some range of values#

Use a precedent from the literature

# Collect the station ids carrying the worst (largest) 20% of scores for each
# method/metric combination; scores are stored so that larger is always worse.
bottom_20pct_ids = {}
for key, frame in main_result_vals.items():
    ranked = frame.sort_values('values', ascending=False)
    worst = ranked.head(int(len(ranked) * 0.2))
    bottom_20pct_ids[key] = pd.DataFrame({'ids': worst['ids'].values, 'values': worst['values'].values})

Note in the comparison above the DKL metrics have roughly 10% +inf values because despite the tails being well-described, the parametric approach still yields underspecified models of the observations! Note that the two methods are not directly comparable, since the DKL is in units of bits while the EMD is in units of the original data. We are not comparing these two divergence measures against each other, rather we are making a comparison between the two methods of generating LN parameters. On this basis, in terms of both DKL and EMD metrics, the log-mean and log-stdev approach is expected to yield closer approximations of the FDCs than the mean and stdev approach. It is not entirely clear what contributes more to this outcome, the normalization of variance or the structural error introduced by the method of moments.

Rank Correlation#

One interesting characteristic of the results is that about 20% of the sites can’t do better than 0.4 bits/sample entropy, and that the parametric estimation is the best approach for these. One question we can ask about this 20% is whether it’s the same sample across the disparate methods, and we can figure this out directly by checking the size of the common set of ids in the worst (highest KLD/EMD) 20%.

  • are there sites that are difficult to predict, regardless of the method?

  • are there methods that work better on some sites compared to others?

  • are there metrics that work better on some sites compared to others?

def regression_scatter_plots(md, main_result_vals, plot_type='rank'):
    """Build pairwise scatter plots comparing estimation methods on one metric.

    Parameters
    ----------
    md : str
        Metric name ('KLD', 'EMD', 'RMSE', 'RE', 'NSE', 'KGE').
    main_result_vals : dict[str, pd.DataFrame]
        Per-method frames with 'ids' and 'values' columns, keyed '<method> <md>'.
    plot_type : str
        'rank' to compare per-station ranks; anything else compares raw values.

    Returns
    -------
    tuple
        (list of bokeh figures, list of method labels that were plotted).
    """
    from itertools import combinations

    # restrict to this metric and drop methods excluded from the comparison
    model_set = [e for e in main_result_vals.keys() if e.endswith(f' {md}')]
    model_set = [e for e in model_set if not e.startswith('LN MoM')]
    model_set = [e for e in model_set if not e.startswith('PredictedLMomentsGEV')]
    model_set = [e for e in model_set if 'time' not in e]
    model_set = [e for e in model_set if 'MLE' not in e]
    model_pairs = list(combinations(model_set, 2))
    rank_scatter_plots = []
    model_labels = []
    for m1, m2 in model_pairs:
        # skip kNN-vs-kNN pairs and anything involving the small-k ensemble
        if 'kNN' in m1 and 'kNN' in m2:
            continue
        if '3' in m1 or '3' in m2:
            continue
        df1 = main_result_vals[m1].copy()
        df2 = main_result_vals[m2].copy()
        if df1.empty or df2.empty:
            continue
        # NSE/KGE were stored as 1 - score upstream, so rank from largest down
        ascending = False if md in ['NSE', 'KGE'] else True
        df1.sort_values('values', ascending=ascending, inplace=True)
        df2.sort_values('values', ascending=ascending, inplace=True)
        df1['rank'] = np.arange(len(df1)) + 1
        df2['rank'] = np.arange(len(df2)) + 1
        # align the two methods station-by-station
        merged = pd.merge(df1, df2, on='ids', suffixes=('_1', '_2'))
        p = figure(title="", width=350, height=300)
        if plot_type == 'rank':
            p.scatter(merged['rank_1'], merged['rank_2'], size=5)
            slope, intercept, r_value, p_value, std_err = linregress(merged['rank_1'], merged['rank_2'])
            xvals = [1, len(merged)]
        else:
            p.scatter(merged['values_1'], merged['values_2'], size=5)
            slope, intercept, r_value, p_value, std_err = linregress(merged['values_1'], merged['values_2'])
            xvals = [merged['values_1'].min(), merged['values_1'].max()]

        # fixed: the dashed '1:1' line and the red fit line previously drew the
        # same points.  Draw a true y = x reference and the fitted regression.
        p.line(xvals, xvals, line_dash='dashed', color='black', line_width=2, legend_label='1:1')
        fit_yvals = [slope * x + intercept for x in xvals]
        p.line(xvals, fit_yvals, line_color='red', line_width=2, legend_label=f'R²={r_value**2:.2f}')

        if plot_type == 'rank':
            p.xaxis.axis_label = f'Rank of {m1[:-3]}'
            p.yaxis.axis_label = f'Rank of {m2[:-3]}'
        else:
            p.xaxis.axis_label = f'{m1[:-3]} {md}'
            p.yaxis.axis_label = f'{m2[:-3]} {md}'
        if m1 not in model_labels:
            model_labels.append(m1)
        if m2 not in model_labels:
            model_labels.append(m2)
        p.legend.location = 'top_left'
        p.legend.background_fill_alpha = 0.7
        p = dpf.format_fig_fonts(p, font_size=16)
        rank_scatter_plots.append(p)
    return rank_scatter_plots, model_labels
# Generate value-space scatter comparisons for every metric and show as one grid.
models = ['KLD', 'EMD', 'RMSE', 'RE', 'NSE', 'KGE']
rank_plots = []
for model in models:
    plot_set, labels = regression_scatter_plots(model, main_result_vals, plot_type='value')
    rank_plots.extend(plot_set)
show(gridplot(rank_plots, ncols=3, width=350, height=300))

Above: the outliers skew the meaning of the correlation between metrics from different methods. The rank correlation is a better measure of the relationship between the methods, since it is not affected by the outliers.

# Assemble one wide dataframe of every method/metric score, indexed by station id.
all_results = []
for name, frame in main_result_vals.items():
    renamed = frame.rename(columns={'values': name}).set_index('ids')
    all_results.append(renamed)
all_results_df = pd.concat(all_results, axis=1)
all_results_df.head()
LN MoM KLD LN Direct KLD 3 kNN KLD 9 kNN KLD LSTM time KLD LSTM freq KLD MLE_KLD LN MoM EMD LN Direct EMD 3 kNN EMD ... LSTM time NSE LSTM freq NSE MLE_NSE LN MoM KGE LN Direct KGE 3 kNN KGE 9 kNN KGE LSTM time KGE LSTM freq KGE MLE_KGE
ids
08EE008 0.365106 0.124748 0.182456 0.114087 0.243106 0.086372 0.095303 7.3746 7.2130 4.5444 ... 0.031520 0.072578 0.127795 0.617497 0.846731 0.327706 0.366476 0.133825 0.240938 0.220104
09AA013 0.232371 0.192162 0.476181 0.368469 0.028556 0.058264 0.179145 3.0474 2.5505 7.4596 ... 0.010069 0.027466 0.045959 0.177162 0.046251 0.647212 0.517155 0.039515 0.157734 0.039800
10BC001 0.211743 0.215619 0.064510 0.087384 0.225792 0.167178 0.207875 1.3077 1.5576 1.1845 ... 0.236739 0.187314 0.036835 0.063630 0.174330 0.149343 0.033794 0.483511 0.436281 0.170397
12452500 0.692603 0.811168 1.634531 1.464451 1.260488 0.848393 0.694206 7.6393 7.2498 12.6949 ... 0.486211 0.493930 2.522811 0.512129 0.297114 0.524670 0.581164 0.358038 0.459669 1.470518
08MH076 0.162631 0.047911 0.097241 0.067064 0.109725 0.106879 0.043390 12.2941 17.0506 14.7435 ... 0.108409 0.085322 0.058381 0.310397 0.505925 0.399747 0.137506 0.347832 0.308771 0.217135

5 rows × 42 columns

from bokeh.models import ColumnDataSource, LinearColorMapper, ColorBar, PrintfTickFormatter, FixedTicker
from bokeh.plotting import figure, show
from bokeh.transform import transform
from bokeh.palettes import RdBu

def plot_correlation_regression(df, md, correlation_type='pearson'):
    """Render a colour-mapped correlation heatmap between methods for one metric.

    Parameters
    ----------
    df : pd.DataFrame
        Wide results table whose method columns are named '<method> <md>'.
    md : str
        Metric suffix used to select columns ('KLD', 'EMD', 'RMSE', 'RE', 'NSE', 'KGE').
    correlation_type : str
        'pearson' or 'kendall'; any other value raises ValueError.

    Returns
    -------
    bokeh figure
        Heatmap with a discrete RdBu colorbar labelled at bin centers.
    """
    # select this metric's columns; 'time' columns are excluded from the comparison
    model_cols = [c for c in df.columns if (c.endswith(f' {md}') and 'time' not in c)]
    model_df = df[model_cols].copy()
    if md in ['NSE', 'KGE']:
        # for NSE and KGE, we want to plot the upper bound as the maximum value
        # NOTE(review): upstream these scores appear to be stored already inverted
        # (1 - score); inverting again here leaves Pearson/Kendall correlations
        # unchanged, but confirm the intended orientation.
        model_df = 1 - model_df
    corr_df = model_df.corr(method=correlation_type)
    if correlation_type == 'kendall':
        label = 'Kendall Tau Correlation'
    elif correlation_type == 'pearson':
        label = 'Pearson Correlation'
    else:
        raise ValueError(f"Unsupported correlation type: {correlation_type}")
    
    # Melt correlation matrix to long-form
    corr_long = corr_df.reset_index().melt(id_vars='index')
    corr_long.columns = ['x', 'y', 'value']
    # Create source
    source = ColumnDataSource(corr_long)

    # low, high = 0.1, 1.0
    # colour scale spans from the smallest observed correlation (capped at 0.1) to 1
    min_val = min(corr_long['value'].min(), 0.1)
    low, high = min_val, 1.0
    n_colors = len(RdBu[10])

    # Bin edges and centers
    bounds = np.linspace(low, high, n_colors + 1)
    centers = 0.5 * (bounds[:-1] + bounds[1:])
    # Define evenly spaced breakpoints
    # tick_vals = np.linspace(low, high, n_colors)

    # Reverse the palette for consistency (if needed)
    palette = list(reversed(RdBu[10]))
    # Optional: format labels as strings with 2 decimals
    labels = [f"{c:.2f}" for c in centers]
    label_overrides = {c: l for c, l in zip(centers, labels)}
    mapper = LinearColorMapper(palette=palette, low=low, high=high)

    p = figure(
        x_range=list(corr_df.columns), y_range=list(reversed(corr_df.columns)),
        x_axis_location="above", width=500, height=400,
        tools="hover", toolbar_location=None, tooltips=[("Model Pair", "@x vs @y"), ("Corr", "@value{0.2f}")]
    )

    # one coloured square per model pair
    p.rect(x="x", y="y", width=1, height=1, source=source,
        fill_color=transform('value', mapper), line_color=None)


    # colorbar ticks sit at bin centers; labels come from major_label_overrides
    color_bar = ColorBar(
        color_mapper=mapper,
        ticker=FixedTicker(ticks=centers.tolist()),
        formatter=PrintfTickFormatter(format=""),  # will be overridden
        major_label_overrides=label_overrides,
        major_label_text_font_size="10pt",
        label_standoff=6,
        border_line_color=None,
        location=(0, 0),
        title=label,
    )

    p.add_layout(color_bar, 'right')

    # Axis styling
    p.axis.major_label_text_font_size = "10pt"
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = 0.66
    p = dpf.format_fig_fonts(p, font_size=16)
    return p
# Pearson correlation heatmaps between methods, one panel per evaluation metric.
# fixed: the first call previously repeated md='NSE' (already shown later) and
# 'KLD' was never rendered; each of the six metrics now appears exactly once.
for metric in ['KLD', 'EMD', 'RE', 'RMSE', 'NSE', 'KGE']:
    p = plot_correlation_regression(all_results_df, md=metric, correlation_type='pearson')
    show(p)

Notes on rank correlations:

  • LSTM time vs. frequency doesn’t affect the rank; the correlation is nearly 1.

  • kNN 1980- vs. 1950- has next highest rank correlation (0.84) since these are the same method applied to slightly different data.

  • LSTM, kNN, and log-parametric are all similar in correlation rank (~0.6).

  • MOM Parametric and all except log-parametric have about .1 less rank correlation (~0.5).

  • LSTM and MLE have low rank correlation (~0.2).

  • kNN and MLE have lowest rank correlation (~0.1).

The question is what catchments / processes are driving these differences?

We next look at how the absolute rank correlation varies as a function of the predictive performance to answer the question, “What is the relationship between predictive performance and rank correlation?”

kNN - based FDC approximation#

Next we consider the various interpretations of kNN we used to generate the FDC from streamflow observations in the network. The kNN represents a hedging of risk against the criteria used to select the proxy to represent the target location. The “nearest neighbour” constraint represents an assumption that the closest locations are most representative, but the choice of neighbouring set interacts with the secondary constraint of concurrency of records, which is a practical reality of environmental observation. The data concurrency constraint means we discard some of the observations of both the target and the proxy in defining the FDCs, and also discard potential proxies that might be more representative of the target because of a lack of sufficient concurrent record.

The goal is to better understand the trade-off between the selection criteria and the performance of the kNN method under as broad a range of conditions as possible. On the one hand, we can be strict in the temporal sense that it is only valid to compare observations that occurred at the same time, but another interpretation is that a significant proportion of the interannual variability can be captured given sufficient data, and more of the interannual variability can be covered by not requiring concurrency. Strictness in the spatial (neighbour) sense means we take exactly the same k neighbours, but this leads to loss of data because more distant neighbours must be sought to satisfy the “k” contributors constraint, whereas we can interpret it as “take the best information where it’s available, and resort to potentially less relevant contributors if necessary to fill gaps.”

The next sets of plots represent different ways to control the preservation of information in the kNN method:

  1. Look across k neighbours for the same method of selecting neighbours (e.g., IDW, CAS, etc.) to see how the kNN method performs as a function of k. This will help us understand the trade-off between the number of neighbours and the performance of the kNN method.

def create_knn_plots(tree_type, ensemble_type, dm, results_df):
    """Plot empirical CDFs of metric `dm` for k = 1..10 kNN ensembles.

    One figure is produced per ensemble weighting scheme ('ID1', 'ID2'), each
    overlaid with the parametric bounds and the PredictedLog reference line.

    Parameters
    ----------
    tree_type : str
        'spatial' or 'attribute' neighbour selection.
    ensemble_type : str
        'timeEnsemble' or 'freqEnsemble' averaging method.
    dm : str
        Evaluation metric column ('KLD', 'EMD', 'RMSE', 'RE', 'NSE', 'KGE').
    results_df : dict[str, pd.DataFrame]
        Results tables; must contain 'knn' and 'parametric' entries.

    Returns
    -------
    list of bokeh figures (possibly fewer than two if a group has no rows).
    """
    knn_df = results_df['knn'].copy()
    plots = []
    clrs = Sunset10
    for wm in ['ID1', 'ID2']:
        data = knn_df[knn_df['ensemble_weight'] == wm].copy()
        data = data[data['ensemble_method'] == ensemble_type]
        data = data[data['tree_type'] == tree_type]

        if data.empty:
            print(f'No data for {tree_type} {ensemble_type} {dm} {wm}')
            continue

        # link the second panel's axes to the first for direct comparison
        if len(plots) > 0:
            f = figure(title=f"{wm} {tree_type} {ensemble_type} (N={len(data)/10})", x_axis_type='log', width=600, height=450,
                       x_range=plots[0].x_range, y_range=plots[0].y_range)
        else:
            f = figure(title=f"{wm} {tree_type} {ensemble_type} (N={len(data)/10})", x_axis_type='log', width=600, height=450)

        f = plot_parametric_bounds(f, dm)
        for i in range(1, 11):
            kd = data[data['k'] == str(i)].copy()
            # NOTE(review): completed_stns is a module-level variable defined elsewhere
            assert len(kd) == len(completed_stns), f"Expected {len(completed_stns)} rows, got {len(kd)}: {kd.head()}"
            values = kd[dm].values
            if dm in ['NSE', 'KGE']:
                values = 1 - values  # invert so that smaller is better

            x, y = compute_empirical_cdf(values)
            f.line(
                x, y,
                line_color=clrs[i-1],
                line_dash='solid',
                legend_label=f'{i}NN',
                line_width=3
            )

        # overlay the LN predicted log-mean/stdev reference line
        # fixed: use the results_df argument rather than the global results_dfs
        parametric = results_df['parametric']
        parametric_values = parametric[parametric['Label'] == 'PredictedLog'][dm].values
        if dm in ['NSE', 'KGE']:
            parametric_values = 1 - parametric_values
        x, y = compute_empirical_cdf(parametric_values)
        f.line(
            x, y,
            line_color='black',
            line_dash='solid',
            legend_label='PredictedLog',
            line_width=3
        )

        # fixed: condition previously tested dm == 'DKL', which callers never pass
        x_axis_label = r'$$D_\text{KL}(\text{KDE}||\text{LN})$$' if dm == 'KLD' else r'$$D_\text{EMD}(\text{KDE}||\text{KNN})$$'
        f.xaxis.axis_label = x_axis_label
        # fixed: compute_empirical_cdf yields P(X <= x), not P(X >= x)
        f.yaxis.axis_label = r'$$P(X\leq x)$$'
        f.legend.location = 'top_left'
        f.legend.click_policy = 'hide'
        f.legend.background_fill_alpha = 0.5
        f = dpf.format_fig_fonts(f, font_size=10)
        plots.append(f)
    return plots
# Compare mean and median scores between time- and frequency-averaged LSTM
# ensembles across all evaluation metrics.
means, medians = {}, {}
for dm in ['KLD', 'EMD', 'RMSE', 'RE', 'NSE', 'KGE']:
    for ensemble_method in ['time', 'frequency']:
        lstm_results = results_dfs['lstm']
        subset = lstm_results[lstm_results['Label'] == ensemble_method]
        mean_vals = subset[dm].mean()
        median_vals = subset[dm].median()
        print(f'For {ensemble_method} {dm}, mean={mean_vals:.2f}, median={median_vals:.2f}')

# merge the results into a single dataframe
# NOTE(review): means/medians are never populated above, so both frames are empty
means_df = pd.DataFrame(means)
medians_df = pd.DataFrame(medians)
For time KLD, mean=0.48, median=0.18
For frequency KLD, mean=0.26, median=0.11
For time EMD, mean=8.50, median=3.79
For frequency EMD, mean=8.09, median=3.62
For time RMSE, mean=13.51, median=5.85
For frequency RMSE, mean=12.61, median=5.57
For time RE, mean=1.77, median=0.21
For frequency RE, mean=1.12, median=0.20
For time NSE, mean=0.30, median=0.91
For frequency NSE, mean=0.08, median=0.93
For time KGE, mean=0.60, median=0.72
For frequency KGE, mean=0.61, median=0.74
# Compare time- vs frequency-averaged LSTM ensemble scores at the catchment
# level for the two divergence metrics, with a regression fit per metric.
plots = []
p = figure(width=600, height=350, x_axis_type='log', y_axis_type='log')
colors = ['red', 'blue']
for i, dm in enumerate(['KLD', 'EMD']):
    lstm_df = results_dfs['lstm']
    tres = lstm_df[lstm_df['Label'] == 'time'][dm].values
    fres = lstm_df[lstm_df['Label'] == 'frequency'][dm].values
    slope, intercept, r_value, p_value, std_err = linregress(tres, fres)
    p.scatter(tres, fres, size=5, legend_label=f'{dm}', color=colors[i], alpha=0.5)
    x = np.linspace(0.01, 1.1 * max(tres), 100)
    p.line(x, slope * x + intercept, line_width=2, line_dash='dashed',
           color=colors[i], legend_label=f'{dm} fit slope={slope:.2f}: R²={r_value**2:.2f}')
p.xaxis.axis_label = 'Time-averaged ensemble metric value'
p.yaxis.axis_label = 'Frequency-averaged ensemble metric value'
p.legend.location = 'bottom_right'
p.legend.click_policy = 'hide'
p = dpf.format_fig_fonts(p, font_size=14)
plots.append(p)
lt = gridplot(plots, ncols=2, width=500, height=350)
show(lt)
# Render the kNN CDF grids for each evaluation metric, alternating the
# time/frequency ensemble averaging method (same call sequence as before,
# consolidated from six copy-pasted cells into one loop).
tree_type = 'attribute'  # 'spatial' or 'attribute' neighbour selection
knn_runs = [
    ('timeEnsemble', 'KLD'),
    ('freqEnsemble', 'EMD'),
    ('timeEnsemble', 'RE'),
    ('freqEnsemble', 'RMSE'),
    ('timeEnsemble', 'NSE'),
    ('freqEnsemble', 'KGE'),
]
for ensemble_type, distance_metric in knn_runs:
    knn_plots = create_knn_plots(tree_type, ensemble_type, distance_metric, results_dfs)
    lt = gridplot(knn_plots, ncols=3, width=425, height=425)
    show(lt)

In temporal ensemble averaging, larger ensembles do worse at already poorly predicted locations by reducing variance.

The plots above compare ensembles of k NN averaged over the time domain, showing that across the (EW, IDW, and CAS) weighting/selection methods and across DKL and EMD divergence metrics, selecting just 1 neighbour yields poorer FDC approximations. The behaviour of the worst ~20% of the FDC approximations is counter-intuitive for the DKL metric because it would seem that adding contributors hedges against outliers at the expense of weakening the small number of cases where there happens to be a very good proxy.

Now keep all else constant and compare k neighbour ensembles in the frequency domain. Note that the maxk, strictk, effectivek, and dynamick variants are not compared in the frequency domain.

Across both DKL and EMD metrics, adding neighbours in the frequency domain averaging has the effect of hedging risk against outliers, with a penalty on the best 20% of predictions. For a small tradeoff in the best predictions, the poorest ~50% are improved in the KLD metric – however there is little or no effect in the worst performing percentile range of the EMD.

The predicted parametric approximation shows a more pronounced tradeoff, limiting downside risk at the expense of upside gain.

Next we look at the effect of varying the interpretation of kNN as far as handling gaps in the record. If the goal is to preserve as much information as possible, we can first relax the constraint of having exactly k consistent neighbours contribute observations concurrent with the target. We can go further by allowing the number of contributors to increase until we get an average of k observations per timestep, increasing the number of neighbours but holding the larger set constant. We can go even further and look for exactly k neighbours at each timestep, thereby not requiring the ensemble simulation to be drawn from the same set of neighbours.

Above, the downside risk hedging effect of the frequency-averaged ensemble increases with the number of neighbours based on the KL divergence metric, with a smaller penalty for the best approximations. The EMD metric shows a different trend, where it represents the best expected approximation compared to all other temporally-averaged ensembles, but it is counter-intuitive that the variants are so spread apart for small k. The log-mean and stdev parametric approximation still suppresses the tails and represents the “safest” bet for the worse half of the sample for both EMD and DKL metrics compared to the time ensemble. The best of each frequency-averaged ensemble is shown for comparison. For increasing k, the top few % of time-averaged ensembles are slightly better than the frequency-averaged ensembles.

We’ve kept the three neighbour selection / weighting methods (EW, IDW, CAS) separate so far. Now let’s compare them for equal k.

Selecting neighbours by attribute similarity (CAS) yields slightly better FDC predictions across the full sample.

For large scale assessments, there is a clear case for frequency over time-based ensembles, and we saw that the kNN method estimation performance was very sensitive to the strictness of adherence to the kNN in terms of the “effective number of contributors per timestep”. Now we take it to the logical extreme and remove the requirement that we compare concurrent periods of record. The idea being that a minimum of five years of observations at a location captures much of the inter-annual variability, including the El Nino period, but not longer durations like the PDO or longer term climate trends.

To reduce the visual clutter, we will keep the log-normal parametric approximation and the frequency-averaged ensemble based on catchment attribute similarity, and add one last comparison, the LSTM-based approximation.

Multi-model ensembles#

Given the low rank correlation between certain methods, here we explore the potential benefits of combining multiple models as ensembles.

# Load the baseline PMFs produced by the previous notebook and confirm that
# every station with a baseline PMF appears in the attribute station set.
pmf_path = Path.cwd() / 'data' / 'results' / 'baseline_distributions' / 'bcub_pmfs.csv'
pmf_df = pd.read_csv(pmf_path, index_col=0)
pmf_stations = pmf_df.columns
assert set(pmf_stations).issubset(station_ids), "Not all stations with a baseline PMF are in the attribute station set."
# Paths to the LSTM meteorological forcing inputs and ensemble result files.
LSTM_forcings_folder = '/home/danbot/neuralhydrology/data/BCUB_catchment_mean_met_forcings_20250320'
LSTM_ensemble_result_folder = '/home/danbot/code/neuralhydrology/data/ensemble_results'
attr_df_fpath = os.path.join('data', 'catchment_attributes_with_runoff_stats.csv')
baseline_distribution_folder = 'data/results/baseline_distributions'

# Which estimation methods to compare.
methods = ('parametric', 'lstm', 'knn',)
# NOTE(review): this flag was previously assigned True and then immediately
# overwritten with False; the redundant first assignment has been removed so
# there is a single authoritative setting.
exclude_pre_1980_data = False  # use only stations with data 1980-present concurrent with Daymet
daymet_start_date = '1950-01-01'  # default start date for Daymet data
k_nearest = 10
if exclude_pre_1980_data:
    daymet_start_date = '1980-01-01'


# Bundle the full configuration for the FDC estimation context.
# NOTE(review): `target_cols` and `daymet_concurrent_stations` are defined in
# an earlier section of the notebook — confirm they are set before this cell.
input_data = {
    'attr_df_fpath': attr_df_fpath,
    'LSTM_forcings_folder': LSTM_forcings_folder,
    'LSTM_ensemble_result_folder': LSTM_ensemble_result_folder,
    'LSTM_concurrent_network': exclude_pre_1980_data,  # use only stations with data 1980-present concurrent with Daymet
    'daymet_start_date': daymet_start_date,
    # 'parameter_prediction_results_folder': parameter_prediction_results_folder,
    # 'predicted_param_dict': predicted_param_dict,
    'divergence_measures': ['DKL', 'EMD'],
    # 'baseline_pmf_stations': pmf_stations,
    'eps': 1e-12,  # numerical-stability constant — presumably guards log(0); verify in FDCEstimationContext
    'min_flow': 1e-4,  # lower bound applied to flow values — TODO confirm units
    'n_grid_points': 2**12,  # resolution of the distribution evaluation grid
    'min_record_length': 5,  # minimum record length (years) for a station to be used
    'minimum_days_per_month': 15,
    'parametric_target_cols': target_cols,
    'all_official_ids': station_ids,
    'daymet_concurrent_stations': daymet_concurrent_stations,
    'baseline_distribution_folder': baseline_distribution_folder,
    'baseline_pmf_stations': pmf_stations,
}

# Build the shared context object used by all downstream estimators.
fdc_context = FDCEstimationContext(**input_data)
    Using all stations in the catchment data with a baseline PMF (validated): 1097
    ...overlap dict loaded from data/record_overlap_dict.json
def compute_multi_model_ensemble_pmf(stn, which_models):
    """Build a multi-model ensemble PMF by element-wise averaging of model PMFs.

    Loads the saved per-station FDC results for the kNN, LSTM, and parametric
    models, averages the requested subset element-wise, and renormalizes so
    the result sums to 1.

    Parameters
    ----------
    stn : str
        Official station ID used to locate the per-model result JSON files
        under ``results_folder`` (defined elsewhere in the notebook).
    which_models : str
        Either 'knn-lstm' or 'knn-lstm-parametric'.

    Returns
    -------
    np.ndarray
        Ensemble PMF normalized to sum to 1.

    Raises
    ------
    ValueError
        If ``which_models`` is not one of the recognized specifications.
    """
    def _load_results(model_folder):
        # JSON is text; open in text mode (the original used 'rb', which
        # json.load tolerates but is unconventional).
        fpath = os.path.join(results_folder, model_folder, f'{stn}_fdc_results.json')
        with open(fpath, 'r') as file:
            return json.load(file)

    # kNN: use the 4-neighbour, attribute-distance, frequency-ensemble variant.
    knn_dict = _load_results('knn')
    knn_model = next((k for k in knn_dict if '4_NN_attribute_dist_ID2_freqEnsemble' in k), None)
    assert knn_model, f'No knn model found for {stn}'
    knn_pmf = knn_dict[knn_model]['pmf']

    # LSTM: frequency-domain ensemble PMF.
    lstm_pmf = _load_results('lstm')['Frequency']['pmf']

    # Parametric: the 'PredictedMOM' (predicted method-of-moments) model.
    param_dict = _load_results('parametric')
    param_model = next((k for k in param_dict if 'PredictedMOM' in k), None)
    assert param_model, f'No parametric model found for {stn}'
    param_pmf = param_dict[param_model]['pmf']

    # Compute the ensemble PMF as the element-wise mean of the selected models.
    if which_models == 'knn-lstm':
        ensemble_pmf = np.mean([knn_pmf, lstm_pmf], axis=0)
    elif which_models == 'knn-lstm-parametric':
        ensemble_pmf = np.mean([knn_pmf, lstm_pmf, param_pmf], axis=0)
    else:
        # Previously an unrecognized value fell through to an opaque
        # NameError on ensemble_pmf; fail with a clear message instead.
        raise ValueError(f'Unknown which_models: {which_models!r}')
    return ensemble_pmf / np.sum(ensemble_pmf)  # normalize to sum to 1
def compute_ensemble_divergence(stn, which_models):
    """Score a multi-model ensemble PMF against the station's baseline PMF.

    The baseline PMF is re-derived from the station's observed record via KDE.
    A uniform PMF over the same support is scored as well, giving a
    "no-information" reference for each divergence measure.

    Returns a tuple ``(dkl, emd, dkl_uniform, emd_uniform)``.
    """
    target = StationData(fdc_context, stn)
    estimator = KDEEstimator(target.baseline_log_grid, target.log_dx)
    baseline_pmf, _ = estimator.compute(
        target.stn_df[target.uar_label].values, target.target_da
    )
    ensemble_pmf = compute_multi_model_ensemble_pmf(stn, which_models=which_models)
    # Divergence of the ensemble from the baseline.
    dkl = target._compute_kld(baseline_pmf, ensemble_pmf)
    emd = target._compute_emd(baseline_pmf, ensemble_pmf)
    # Uniform reference distribution over the same support.
    uniform_pmf = np.ones_like(baseline_pmf) / len(baseline_pmf)
    dkl_uniform = target._compute_kld(baseline_pmf, uniform_pmf)
    emd_uniform = target._compute_emd(baseline_pmf, uniform_pmf)
    # Sanity checks: both distributions must be properly normalized.
    assert np.allclose(np.sum(baseline_pmf), 1)
    assert np.allclose(np.sum(ensemble_pmf), 1)
    # Drop the heavyweight objects before returning.
    del target
    del estimator
    return dkl, emd, dkl_uniform, emd_uniform
# Accumulate (station, DKL, EMD, DKL_uniform, EMD_uniform) rows here.
ensembles = []
# Output path for the multi-model ensemble divergence scores.
# NOTE(review): an earlier path ('data/results/knn_lstm_ensemble_results.csv')
# was dead-assigned and immediately overwritten; only the path below is used.
ensemble_fpath = 'data/results/knn_lstm_logNorm_ensemble_results.csv'

process_ensemble = False  # flip to True to (re)compute the ensemble divergences
if process_ensemble:
    for n_done, stn in enumerate(completed_stns, start=1):
        dkl, emd, dkl_uniform, emd_uniform = compute_ensemble_divergence(stn, which_models='knn-lstm-parametric')
        ensembles.append((stn, dkl, emd, dkl_uniform, emd_uniform))
        # Periodic progress report.
        if n_done % 50 == 0:
            print(f'Processed {n_done}/{len(completed_stns)} stations')

    ensemble_df = pd.DataFrame(ensembles, columns=['Official_ID', 'DKL', 'EMD', 'DKL_Uniform', 'EMD_Uniform'])
    ensemble_df.to_csv(ensemble_fpath, index=False)
# edf1 = pd.read_csv('data/results/knn_lstm_ensemble_results.csv')
# edf1.set_index('Official_ID', inplace=True)
# edf1.columns = ['DKL_e2', 'EMD_e2', 'DKL_Uniform_e2', 'EMD_Uniform_e2']
# edf = pd.read_csv('data/results/knn_lstm_logNorm_ensemble_results.csv')
# edf.set_index('Official_ID', inplace=True)
# edf.columns = ['DKL_e3', 'EMD_e3', 'DKL_Uniform_e3', 'EMD_Uniform_e3']
# ensemble_df = pd.concat([edf, edf1], axis=1)
# ensemble_df.head()
# ensemble_df.head()
# ensemble_df['DKL_fail'] = ensemble_df['DKL_Uniform_e2'] < ensemble_df['DKL_e2']
# ensemble_df['EMD_fail'] = ensemble_df['EMD_Uniform_e2'] < ensemble_df['EMD_e2']
# dkl_fail_count = ensemble_df['DKL_fail'].sum()
# emd_fail_count = ensemble_df['EMD_fail'].sum()
# print(f'Ensemble DKL fails: {dkl_fail_count} ({dkl_fail_count/len(ensemble_df)*100:.2f}%)')
# print(f'Ensemble EMD fails: {emd_fail_count} ({emd_fail_count/len(ensemble_df)*100:.2f}%)')
# dkl_fails = ensemble_df[ensemble_df['DKL_fail']].copy()
# dkl_fails

Check the percent change in land cover over 2010 to 2020 compared to the DKL/EMD divergence metrics#

# Load the BCUB watershed attributes and compute 2010-2020 land cover change.
bcub_attrs_fname = Path('data') / 'BCUB_watershed_attributes_updated_20250227.csv'
bcub_df = pd.read_csv(bcub_attrs_fname, dtype={'official_id': str, 'watershed_id': str})
# Drop the stray unnamed index column if present (idiomatic replacement for
# the manual column-filter comprehension).
bcub_df = bcub_df.drop(columns=['Unnamed: 0'], errors='ignore')
# Change computed as 2010 minus 2020 area fraction (positive = loss).
for change_label, lu in [('forest_change', 'forest'), ('ice_change', 'snow_ice'), ('water_change', 'water')]:
    bcub_df[change_label] = bcub_df[f'land_use_{lu}_frac_2010'] - bcub_df[f'land_use_{lu}_frac_2020']
# plot the distribution of change values:
f = figure(title='Distribution of land cover change (2010-2020)', width=600, height=400)
for i, s in enumerate(['forest_change', 'ice_change', 'water_change']):
    x, y = compute_empirical_cdf(bcub_df[s].values)
    f.line(x, y, line_width=2, color=Bokeh6[2*i], legend_label=s)
# BUG FIX: the values are differences of area fractions (0-1 scale), not
# percentages, so the old '% change' label was misleading.
f.xaxis.axis_label = 'Change in area fraction'
f.yaxis.axis_label = 'P(x)'
f.legend.location = 'top_left'
f.legend.click_policy = 'hide'
f = dpf.format_fig_fonts(f, font_size=14)
show(f)
bcub_df.head()
len(bcub_df)
1308
# Plot the correlation between forest cover change and each model's DKL/EMD
# divergence values, with a least-squares fit line per model.
figs = []
for dm in ['DKL', 'EMD']:
    f = figure(title=f'Correlation between land cover change and {dm}', width=600, height=400, y_axis_type='log')
    models = [f'PredictedLog_{dm}', f'kNN1980_{dm}', f'LSTM_freq_{dm}']
    for i, model in enumerate(models):
        # BUG FIX: look up each model's own results. The key was previously
        # hard-coded to f'LSTM_freq_{dm}', which raised a KeyError (see the
        # traceback) and would have plotted the same data for all models.
        data = main_result_vals[model].copy()
        # convert to a dict with the ids as keys
        model_dict = dict(zip(data['ids'], data['values']))
        # map the model values to the bcub_df and keep only matched stations
        model_df = bcub_df.copy()
        model_df[model] = model_df['official_id'].map(model_dict)
        model_df = model_df[model_df[model].notna()].copy()
        data = model_df[[model, 'forest_change']]
        f.scatter(data['forest_change'], data[model], size=5, color=Bokeh6[i], legend_label=model, alpha=0.4)
        slope, intercept, r_value, p_value, std_err = linregress(data['forest_change'], data[model])
        x = np.linspace(data['forest_change'].min(), data['forest_change'].max(), 100)
        y = slope * x + intercept
        # Use the same palette index as the scatter so each fit line matches
        # the colour of its points (was Bokeh6[2*i]).
        f.line(x, y, line_width=2, color=Bokeh6[i], legend_label=f'{model} (R²={r_value**2:.2f})')
    f.xaxis.axis_label = 'Forest Change (2010-2020)'
    f.yaxis.axis_label = f'{dm} Value'
    f.legend.location = 'top_left'
    f.legend.click_policy = 'hide'
    f.legend.background_fill_alpha = 0.5
    f = dpf.format_fig_fonts(f, font_size=14)
    figs.append(f)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[59], line 9
      7 models = [f'PredictedLog_{dm}', f'kNN1980_{dm}', f'LSTM_freq_{dm}'] 
      8 for i, model in enumerate(models):    
----> 9     data = main_result_vals[f'LSTM_freq_{dm}'].copy()
     10     # convert to a dict with the ids as keys
     11     model_dict = dict(zip(data['ids'], data['values']))

KeyError: 'LSTM_freq_DKL'
# Arrange the two correlation figures side by side and render.
show(gridplot(figs, ncols=2, width=600, height=400))

Distribution Matching of Ensemble Time Series via Quantile Transformation#

Goal:#

Adjust time-ensemble mean or members \(X(t, m)\) such that the marginal distribution of predictions matches a target PDF while preserving temporal structure (autocorrelation).

Given:#

  • \( X(t, m) \): Time series predictions of shape \( T \times M \)

  • \( f_{\text{time}}(x) \): PDF estimated from time-averaged ensemble (e.g., \( \bar{X}(t) = \frac{1}{M} \sum_m X(t, m) \))

  • \( f_{\text{freq}}(x) \): PDF estimated from frequency-averaged ensemble (i.e., mean PDF across members)

  • \( F_{\text{time}}(x) = \int_{-\infty}^x f_{\text{time}}(s) ds\): CDF of time-ensemble

  • \( F_{\text{freq}}^{-1}(u) \): Inverse CDF (quantile function) of frequency ensemble

Transformation:#

For each prediction value \( x = X(t, m) \), apply:

\[\tilde{X}(t, m) = F_{\text{freq}}^{-1}(F_{\text{time}}(X(t, m)))\]

This mapping:

  • Preserves rank of values (and thus autocorrelation)

  • Transforms the marginal PDF of \(X(t, m) \) to match \(f_{\text{freq}}\)

Notes:#

  • \(F_{\text{time}} \) and \(F_{\text{freq}}^{-1} \) may be constructed from empirical CDFs or smoothed (e.g. via KDE)

  • Valid under assumption that predicted values are continuous and strictly monotonic